# 将豆瓣读书网站上的所有图书信息爬取下来，并存储到MySQL数据库中。
# 爬取信息字段要求：[ID号、书名、作者、出版社、原作名、译者、出版年、页数、定价、装帧、丛书、ISBN、评分、评论人数]
# 步骤3：根据Redis数据库的所有图书详情页URL信息爬取豆瓣读书网站上的图书详情页，获取图书信息并存储到Redis数据库
# 第十一周作业
# 班级：Python五期
# 学员：李子坚

# -*- coding: utf-8 -*-
# NOTE(review): per PEP 263 a coding declaration only takes effect on line 1 or 2
# of the file, so here it is inert; harmless under Python 3, where UTF-8 is the
# default source encoding anyway.
import scrapy,re
from bookinfo.items import BookItem
from scrapy_redis.spiders import RedisSpider

class BookSpider(RedisSpider):
    '''Spider that scrapes book information from douban.com book detail pages.

    Detail-page URLs are consumed from the Redis list named by ``redis_key``
    (pushed there by an earlier step of the pipeline); every successfully
    fetched page is parsed into a ``BookItem`` and yielded downstream.
    '''
    name = 'book'
    #allowed_domains = ['book.douban.com']
    #start_urls = ['http://book.douban.com/']
    redis_key = "BookSpider:start_urls"		# Redis key holding detail-page URLs to crawl

    def __init__(self, *args, **kwargs):
        '''Initialize the spider.

        The allowed-domain list is supplied dynamically through the ``domain``
        spider argument as a comma-separated string, e.g.
        ``scrapy crawl book -a domain=book.douban.com``.
        '''
        domain = kwargs.pop('domain', '')
        # BUG FIX: in Python 3 filter() returns a one-shot iterator; scrapy's
        # offsite middleware re-iterates allowed_domains, so it must be a list.
        self.allowed_domains = [d for d in domain.split(',') if d]
        super(BookSpider, self).__init__(*args, **kwargs)

    @staticmethod
    def _text_field(info_text, label):
        '''Return the stripped text value of the info row labelled *label*, or "".

        Rows in the douban "info" block look like
        ``<span class="pl">label</span> value <br>``.
        '''
        # BUG FIX: the original called SelectorList.re(pattern, re.S), but that
        # second argument is 'replace_entities', not a flags parameter — DOTALL
        # was never applied.  re.findall with an explicit re.S is used instead.
        matches = re.findall(
            r'<span class="pl">\s*%s.*?</span>(.*?)<br>' % label, info_text, re.S)
        return matches[0].strip() if matches else ""

    @staticmethod
    def _link_field(info_text, label):
        '''Return the "/"-joined texts of all <a> tags in the labelled row, or "".

        Used for fields (author, translator) whose values are one link per name.
        '''
        rows = re.findall(
            r'<span class="pl">\s*%s.*?</span>(.*?)<br>' % label, info_text, re.S)
        if not rows:
            return ""
        names = re.findall(r'<a.*?>\s*(.*?)\s*</a>', rows[0], re.S)
        return "/".join(names)

    def parse(self, response):
        '''Parse one book detail page and yield a populated BookItem.

        Extracted fields: ID, title, author, press, org_name, translator,
        pub_date, pages_num, price, binding, series, ISBN, score, comment_num.
        Any field missing from the page defaults to "".
        '''
        if response.status != 200:
            # Re-queue the failed page.  BUG FIX: return immediately — the
            # original fell through and tried to parse the error body.
            yield scrapy.Request(url=response.url, callback=self.parse, dont_filter=True)
            return

        item = BookItem()

        # The book ID is embedded in the URL: .../subject/<digits>/
        ids = re.findall(r'book\.douban\.com/subject/([0-9]+)/', response.url)
        item['ID'] = ids[0] if ids else ""
        # extract_first() may return None on malformed pages; guard before strip.
        item['title'] = (response.css("h1>span::text").extract_first() or "").strip()

        # The "info" block holds one "<span class=pl>label</span> value <br>"
        # row per bibliographic field.
        info_text = response.css("div#info").extract_first() or ""

        item['author'] = self._link_field(info_text, '作者')
        item['press'] = self._text_field(info_text, '出版社')
        item['org_name'] = self._text_field(info_text, '原作名')
        item['translator'] = self._link_field(info_text, '译者')
        item['pub_date'] = self._text_field(info_text, '出版年')

        # Page count is kept digits-only, mirroring the original extraction.
        pages = re.findall(
            r'<span class="pl">\s*页数.*?</span>\s*([0-9]+)\s*<br>', info_text, re.S)
        item['pages_num'] = pages[0] if pages else ""

        item['price'] = self._text_field(info_text, '定价')
        item['binding'] = self._text_field(info_text, '装帧')

        # The series name is the text of the first link in the "丛书" row.
        series = re.findall(
            r'<span class="pl">\s*丛书.*?</span>.*?<a.*?>(.*?)</a>', info_text, re.S)
        item['series'] = series[0].strip() if series else ""

        # Older books carry no ISBN; fall back to the legacy "统一书号" field.
        item['ISBN'] = (self._text_field(info_text, 'ISBN')
                        or self._text_field(info_text, '统一书号'))

        # Rating block: average score and number of raters.
        rating = response.css("div#interest_sectl")
        item['score'] = (rating.css("strong.ll.rating_num::text").extract_first() or "").strip()
        comment_num = rating.xpath(".//span[@property='v:votes']/text()").extract_first()
        item['comment_num'] = comment_num.strip() if comment_num else ""

        yield item
